/* 
 *    Copyright (C) 2003-2006 Gabest
 *    http://www.gabest.org
 *
 *  This Program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *
 *  This Program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this Program; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *  http://www.gnu.org/copyleft/gpl.html
 *
 *  Based on Intel's AP-942
 *
 */

#include "stdafx.h"
#include <inttypes.h>
#include "libmpeg2.h"
#include "attributes.h"
#include "../../../DSUtil/simd.h"

/*
 * Mask subtracted with psubusb from an intermediate average in the
 * *_xy_* routines of this file.  psubusb operates on individual bytes, so
 * all sixteen byte lanes have to contain 1; a 16-bit splat of 1 would leave
 * every odd byte at 0 and apply the adjustment to only every second pixel.
 */
static const __m128i const_1_16_bytes=_mm_set1_epi8(1);

/*
 * Copy a 16-byte-wide block, four rows per pass.
 *
 *   ecx  destination, written with movdqa
 *   edx  source, read with movdqu
 *   eax  byte distance between consecutive rows (same for source and destination)
 *   esi  number of rows; the loop only stops when the counter hits exactly 0
 *        after stepping by 4, so it has to be a multiple of 4
 *
 * movdqu/movdqa come from DSUtil/simd.h and are assumed to behave like the
 * SSE2 instructions they are named after (unaligned load / aligned store).
 */
static void MC_put_o_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
{
    const int pitch2 = eax + eax;
    const int pitch3 = pitch2 + eax;
    const int pitch4 = pitch2 * 2;

    while (esi) {
        __m128i row0, row1, row2, row3;

        /* Read all four rows first ... */
        movdqu(row0, edx);
        movdqu(row1, edx + eax);
        movdqu(row2, edx + pitch2);
        movdqu(row3, edx + pitch3);

        /* ... then write them out. */
        movdqa(ecx, row0);
        movdqa(ecx + eax, row1);
        movdqa(ecx + pitch2, row2);
        movdqa(ecx + pitch3, row3);

        edx += pitch4;
        ecx += pitch4;
        esi -= 4;
    }
}

/*
 * Copy an 8-byte-wide block, four rows per pass.
 *
 *   ecx  destination
 *   edx  source
 *   eax  byte distance between consecutive rows
 *   esi  number of rows; must be a multiple of 4 for the loop to terminate
 *
 * Each row travels through the register by means of movlpd (a DSUtil/simd.h
 * helper, assumed to move 8 bytes via the low half as the instruction of the
 * same name does).
 */
static void MC_put_o_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
{
    const int pitch2 = eax + eax;
    const int pitch3 = pitch2 + eax;
    const int pitch4 = pitch2 * 2;

    while (esi) {
        __m128d row0, row1, row2, row3;

        movlpd(row0, edx);
        movlpd(row1, edx + eax);
        movlpd(row2, edx + pitch2);
        movlpd(row3, edx + pitch3);

        movlpd(ecx, row0);
        movlpd(ecx + eax, row1);
        movlpd(ecx + pitch2, row2);
        movlpd(ecx + pitch3, row3);

        edx += pitch4;
        ecx += pitch4;
        esi -= 4;
    }
}

/*
 * Horizontal half-sample "put" for a 16-byte-wide block: each output row is
 * pavgb(source row, source row shifted by one byte).  Four rows per pass.
 *
 *   ecx  destination, written with movdqa
 *   edx  source, read with movdqu (17 bytes of each row are touched)
 *   eax  byte distance between consecutive rows
 *   esi  number of rows; must be a multiple of 4 for the loop to terminate
 *
 * The helpers come from DSUtil/simd.h and are assumed to match the SSE2
 * instructions of the same name.
 */
static void MC_put_x_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
{
    const int pitch2 = eax + eax;
    const int pitch3 = pitch2 + eax;
    const int pitch4 = pitch2 * 2;

    while (esi) {
        __m128i a0, b0, a1, b1, a2, b2, a3, b3;

        /* Row 0 */
        movdqu(a0, edx);
        movdqu(b0, edx + 1);
        pavgb(a0, b0);

        /* Row 1 */
        movdqu(a1, edx + eax);
        movdqu(b1, edx + eax + 1);
        pavgb(a1, b1);

        /* Row 2 */
        movdqu(a2, edx + pitch2);
        movdqu(b2, edx + pitch2 + 1);
        pavgb(a2, b2);

        /* Row 3 */
        movdqu(a3, edx + pitch3);
        movdqu(b3, edx + pitch3 + 1);
        pavgb(a3, b3);

        /* All reads are done; write the four results. */
        movdqa(ecx, a0);
        movdqa(ecx + eax, a1);
        movdqa(ecx + pitch2, a2);
        movdqa(ecx + pitch3, a3);

        edx += pitch4;
        ecx += pitch4;
        esi -= 4;
    }
}

/*
 * Horizontal half-sample "put" for an 8-byte-wide block.  Two rows share one
 * register (movlpd fills one half, movhpd the other), so a single pavgb
 * handles two rows; four rows are produced per pass.
 *
 *   ecx  destination
 *   edx  source (9 bytes of each row are touched)
 *   eax  byte distance between consecutive rows
 *   esi  number of rows; must be a multiple of 4 for the loop to terminate
 *
 * The helpers come from DSUtil/simd.h and are assumed to match the SSE2
 * instructions of the same name.
 */
static void MC_put_x_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
{
    const int pitch2 = eax + eax;
    const int pitch3 = pitch2 + eax;
    const int pitch4 = pitch2 * 2;
    __m128i pair01, pair01_next, pair23, pair23_next;

    while (esi) {
        /* Rows 0 and 1, plus the same rows one byte to the right. */
        movlpd(pair01, edx);
        movhpd(pair01, edx + eax);
        movlpd(pair01_next, edx + 1);
        movhpd(pair01_next, edx + eax + 1);

        /* Rows 2 and 3, plus the same rows one byte to the right. */
        movlpd(pair23, edx + pitch2);
        movhpd(pair23, edx + pitch3);
        movlpd(pair23_next, edx + pitch2 + 1);
        movhpd(pair23_next, edx + pitch3 + 1);

        pavgb(pair01, pair01_next);
        pavgb(pair23, pair23_next);

        movlpd(ecx, pair01);
        movhpd(ecx + eax, pair01);
        movlpd(ecx + pitch2, pair23);
        movhpd(ecx + pitch3, pair23);

        edx += pitch4;
        ecx += pitch4;
        esi -= 4;
    }
}

/*
 * Vertical half-sample "put" for a 16-byte-wide block: output row i is
 * pavgb(source row i, source row i+1).  Four rows per pass; the fifth source
 * row read in a pass is kept in a register and becomes the first row of the
 * next pass.
 *
 *   ecx  destination, written with movdqa
 *   edx  source, read with movdqu (rows 0 .. esi are touched)
 *   eax  byte distance between consecutive rows
 *   esi  number of rows; must be a multiple of 4 for the loop to terminate
 *
 * The helpers come from DSUtil/simd.h and are assumed to match the SSE2
 * instructions of the same name.
 */
static void MC_put_y_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
{
    const int pitch2 = eax + eax;
    const int pitch3 = pitch2 + eax;
    const int pitch4 = pitch2 * 2;
    __m128i top;

    /* The first source row is fetched once, before the loop. */
    movdqu(top, edx);

    while (esi) {
        __m128i r1, r2, r3, r4;

        movdqu(r1, edx + eax);
        movdqu(r2, edx + pitch2);
        movdqu(r3, edx + pitch3);
        movdqu(r4, edx + pitch4);

        /* Each average overwrites its first operand, so the order matters:
           a row is consumed as second operand before it is overwritten. */
        pavgb(top, r1);
        pavgb(r1, r2);
        pavgb(r2, r3);
        pavgb(r3, r4);

        movdqa(ecx, top);
        movdqa(ecx + eax, r1);
        movdqa(ecx + pitch2, r2);
        movdqa(ecx + pitch3, r3);

        /* r4 is untouched by the averaging and seeds the next pass. */
        movdqa(top, r4);

        edx += pitch4;
        ecx += pitch4;
        esi -= 4;
    }
}

/*
 * Vertical half-sample "put" for an 8-byte-wide block: output row i is
 * pavgb(source row i, source row i+1).  Every row occupies its own register
 * (moved with movlpd); four rows are produced per pass and the fifth source
 * row is carried into the next pass.
 *
 *   ecx  destination
 *   edx  source (rows 0 .. esi are touched)
 *   eax  byte distance between consecutive rows
 *   esi  number of rows; must be a multiple of 4 for the loop to terminate
 *
 * The helpers come from DSUtil/simd.h and are assumed to match the SSE2
 * instructions of the same name.
 */
static void MC_put_y_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
{
    const int pitch2 = eax + eax;
    const int pitch3 = pitch2 + eax;
    const int pitch4 = pitch2 * 2;
    __m128i top;

    /* The first source row is fetched once, before the loop. */
    movlpd(top, edx);

    while (esi) {
        __m128i r1, r2, r3, r4;

        movlpd(r1, edx + eax);
        movlpd(r2, edx + pitch2);
        movlpd(r3, edx + pitch3);
        movlpd(r4, edx + pitch4);

        /* Order matters: each row is used as second operand before it is
           overwritten as first operand. */
        pavgb(top, r1);
        pavgb(r1, r2);
        pavgb(r2, r3);
        pavgb(r3, r4);

        movlpd(ecx, top);
        movlpd(ecx + eax, r1);
        movlpd(ecx + pitch2, r2);
        movlpd(ecx + pitch3, r3);

        /* Carry the last source row into the next pass. */
        movdqa(top, r4);

        edx += pitch4;
        ecx += pitch4;
        esi -= 4;
    }
}

/*
 * Diagonal (horizontal + vertical) half-sample "put" for a 16-byte-wide
 * block, two rows per pass.  Every source row is first averaged with its
 * one-byte-shifted copy; vertically adjacent results are then averaged, with
 * const_1_16_bytes subtracted (psubusb) from the middle row's horizontal
 * average beforehand.  NOTE(review): the subtraction looks like the rounding
 * compensation of the AP-942 scheme mentioned in the file header - confirm.
 *
 *   dest    destination, written with movdqa
 *   ref     source, read with movdqu (rows 0 .. height, 17 bytes each)
 *   stride  byte distance between consecutive rows
 *   height  number of rows; must be a multiple of 2 for the loop to terminate
 *
 * The helpers come from DSUtil/simd.h and are assumed to match the SSE2
 * instructions of the same name.
 */
static void MC_put_xy_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
    const int stride2 = stride + stride;
    __m128i ones, top, top_next, mid, mid_next, bot, bot_next;

    movdqa(ones, const_1_16_bytes);

    /* Row 0 and its shifted copy are loaded once; later passes inherit them. */
    movdqu(top, ref);
    movdqu(top_next, ref + 1);

    while (height) {
        movdqu(mid, ref + stride);
        movdqu(mid_next, ref + stride + 1);
        movdqu(bot, ref + stride2);
        movdqu(bot_next, ref + stride2 + 1);

        pavgb(top, top_next);          /* horizontal average, row 0 */
        pavgb(mid, mid_next);          /* horizontal average, row 1 */
        movdqa(top_next, bot_next);    /* keep shifted row 2 for the next pass */
        pavgb(bot_next, bot);          /* horizontal average, row 2 */
        psubusb(mid, ones);
        pavgb(top, mid);               /* output row 0 */
        pavgb(mid, bot_next);          /* output row 1 */

        movdqa(dest, top);
        movdqa(dest + stride, mid);

        /* Unshifted row 2 becomes row 0 of the next pass. */
        movdqa(top, bot);

        ref += stride2;
        dest += stride2;
        height -= 2;
    }
}

/*
 * Diagonal (horizontal + vertical) half-sample "put" for an 8-byte-wide
 * block, two rows per pass.  Same sequence as the 16-wide variant, but rows
 * are moved with movlpd (8 bytes).  const_1_16_bytes is subtracted (psubusb)
 * from the middle row's horizontal average before the vertical averaging.
 * NOTE(review): presumably the AP-942 rounding compensation - confirm.
 *
 *   dest    destination
 *   ref     source (rows 0 .. height, 9 bytes each)
 *   stride  byte distance between consecutive rows
 *   height  number of rows; must be a multiple of 2 for the loop to terminate
 *
 * The helpers come from DSUtil/simd.h and are assumed to match the SSE2
 * instructions of the same name.
 */
static void MC_put_xy_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
    const int stride2 = stride + stride;
    __m128i ones, top, mid, top_next, mid_next, bot, bot_next;

    movdqa(ones, const_1_16_bytes);

    /* Row 0 and its shifted copy are loaded once; later passes inherit them. */
    movlpd(top, ref);
    movlpd(top_next, ref + 1);

    while (height) {
        movlpd(mid, ref + stride);
        movlpd(mid_next, ref + stride + 1);
        movlpd(bot, ref + stride2);
        movlpd(bot_next, ref + stride2 + 1);

        pavgb(top, top_next);          /* horizontal average, row 0 */
        pavgb(mid, mid_next);          /* horizontal average, row 1 */
        movdqa(top_next, bot_next);    /* keep shifted row 2 for the next pass */
        pavgb(bot_next, bot);          /* horizontal average, row 2 */
        psubusb(mid, ones);
        pavgb(top, mid);               /* output row 0 */
        pavgb(mid, bot_next);          /* output row 1 */

        movlpd(dest, top);
        movlpd(dest + stride, mid);

        /* Unshifted row 2 becomes row 0 of the next pass. */
        movdqa(top, bot);

        ref += stride2;
        dest += stride2;
        height -= 2;
    }
}

/*
 * Full-sample "avg" for a 16-byte-wide block: every destination row is
 * replaced by pavgb(source row, current destination row).  Four rows per pass.
 *
 *   dest    destination; read as a pavgb memory operand and written with movdqa
 *   ref     source, read with movdqu
 *   stride  byte distance between consecutive rows
 *   height  number of rows; must be a multiple of 4 for the loop to terminate
 *
 * The helpers come from DSUtil/simd.h and are assumed to match the SSE2
 * instructions of the same name.
 */
static void MC_avg_o_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
    const int stride2 = stride + stride;
    const int stride3 = stride2 + stride;
    const int stride4 = stride2 * 2;

    while (height) {
        __m128i row0, row1, row2, row3;

        movdqu(row0, ref);
        movdqu(row1, ref + stride);
        movdqu(row2, ref + stride2);
        movdqu(row3, ref + stride3);

        /* Blend with what is already in the destination. */
        pavgb(row0, dest);
        pavgb(row1, dest + stride);
        pavgb(row2, dest + stride2);
        pavgb(row3, dest + stride3);

        movdqa(dest, row0);
        movdqa(dest + stride, row1);
        movdqa(dest + stride2, row2);
        movdqa(dest + stride3, row3);

        ref += stride4;
        dest += stride4;
        height -= 4;
    }
}

/*
 * Full-sample "avg" for an 8-byte-wide block: every destination row is
 * replaced by pavgb(source row, current destination row).  Two rows share a
 * register (movlpd / movhpd), four rows are handled per pass.
 *
 *   dest    destination (read, then overwritten)
 *   ref     source
 *   stride  byte distance between consecutive rows
 *   height  number of rows; must be a multiple of 4 for the loop to terminate
 *
 * The helpers come from DSUtil/simd.h and are assumed to match the SSE2
 * instructions of the same name.
 */
static void MC_avg_o_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
    const int stride2 = stride + stride;
    const int stride3 = stride2 + stride;
    const int stride4 = stride2 * 2;
    __m128i src01, dst01, src23, dst23;

    while (height) {
        /* Source rows 0-1 and 2-3. */
        movlpd(src01, ref);
        movhpd(src01, ref + stride);
        movlpd(src23, ref + stride2);
        movhpd(src23, ref + stride3);

        /* Matching destination rows. */
        movlpd(dst01, dest);
        movhpd(dst01, dest + stride);
        movlpd(dst23, dest + stride2);
        movhpd(dst23, dest + stride3);

        pavgb(src01, dst01);
        pavgb(src23, dst23);

        movlpd(dest, src01);
        movhpd(dest + stride, src01);
        movlpd(dest + stride2, src23);
        movhpd(dest + stride3, src23);

        ref += stride4;
        dest += stride4;
        height -= 4;
    }
}

/*
 * Horizontal half-sample "avg" for a 16-byte-wide block: each source row is
 * averaged with its one-byte-shifted copy, and that result is averaged with
 * the current destination row.  Four rows per pass.
 *
 *   dest    destination; read as a pavgb memory operand and written with movdqa
 *   ref     source, read with movdqu (17 bytes of each row are touched)
 *   stride  byte distance between consecutive rows
 *   height  number of rows; must be a multiple of 4 for the loop to terminate
 *
 * The helpers come from DSUtil/simd.h and are assumed to match the SSE2
 * instructions of the same name.
 */
static void MC_avg_x_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
    const int stride2 = stride + stride;
    const int stride3 = stride2 + stride;
    const int stride4 = stride2 * 2;
    __m128i a0, b0, a1, b1, a2, b2, a3, b3;

    while (height) {
        /* Horizontal interpolation, one row at a time. */
        movdqu(a0, ref);
        movdqu(b0, ref + 1);
        pavgb(a0, b0);

        movdqu(a1, ref + stride);
        movdqu(b1, ref + stride + 1);
        pavgb(a1, b1);

        movdqu(a2, ref + stride2);
        movdqu(b2, ref + stride2 + 1);
        pavgb(a2, b2);

        movdqu(a3, ref + stride3);
        movdqu(b3, ref + stride3 + 1);
        pavgb(a3, b3);

        /* Blend with what is already in the destination. */
        pavgb(a0, dest);
        pavgb(a1, dest + stride);
        pavgb(a2, dest + stride2);
        pavgb(a3, dest + stride3);

        movdqa(dest, a0);
        movdqa(dest + stride, a1);
        movdqa(dest + stride2, a2);
        movdqa(dest + stride3, a3);

        ref += stride4;
        dest += stride4;
        height -= 4;
    }
}

/*
 * Horizontal half-sample "avg" for an 8-byte-wide block: each source row is
 * averaged with its one-byte-shifted copy, and that result is averaged with
 * the current destination row.  Two rows share a register (movlpd / movhpd),
 * four rows are handled per pass.
 *
 *   dest    destination (read, then overwritten)
 *   ref     source (9 bytes of each row are touched)
 *   stride  byte distance between consecutive rows
 *   height  number of rows; must be a multiple of 4 for the loop to terminate
 *
 * The helpers come from DSUtil/simd.h and are assumed to match the SSE2
 * instructions of the same name.
 */
static void MC_avg_x_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
    const int stride2 = stride + stride;
    const int stride3 = stride2 + stride;
    const int stride4 = stride2 * 2;
    __m128i src01, src01_next, src23, src23_next, dst01, dst23;

    while (height) {
        /* Source rows 0-1 and their shifted copies. */
        movlpd(src01, ref);
        movhpd(src01, ref + stride);
        movlpd(src01_next, ref + 1);
        movhpd(src01_next, ref + stride + 1);

        /* Source rows 2-3 and their shifted copies. */
        movlpd(src23, ref + stride2);
        movhpd(src23, ref + stride3);
        movlpd(src23_next, ref + stride2 + 1);
        movhpd(src23_next, ref + stride3 + 1);

        /* Current destination rows. */
        movlpd(dst01, dest);
        movhpd(dst01, dest + stride);
        movlpd(dst23, dest + stride2);
        movhpd(dst23, dest + stride3);

        pavgb(src01, src01_next);
        pavgb(src23, src23_next);
        pavgb(src01, dst01);
        pavgb(src23, dst23);

        movlpd(dest, src01);
        movhpd(dest + stride, src01);
        movlpd(dest + stride2, src23);
        movhpd(dest + stride3, src23);

        ref += stride4;
        dest += stride4;
        height -= 4;
    }
}

/*
 * Vertical half-sample "avg" for a 16-byte-wide block: pavgb(source row i,
 * source row i+1) is averaged with the current destination row i.  Four rows
 * per pass; the fifth source row read in a pass is carried into the next one.
 *
 *   dest    destination; read as a pavgb memory operand and written with movdqa
 *   ref     source, read with movdqu (rows 0 .. height are touched)
 *   stride  byte distance between consecutive rows
 *   height  number of rows; must be a multiple of 4 for the loop to terminate
 *
 * The helpers come from DSUtil/simd.h and are assumed to match the SSE2
 * instructions of the same name.
 */
static void MC_avg_y_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
    const int stride2 = stride + stride;
    const int stride3 = stride2 + stride;
    const int stride4 = stride2 * 2;
    __m128i top, r1, r2, r3, r4;

    /* The first source row is fetched once, before the loop. */
    movdqu(top, ref);

    while (height) {
        movdqu(r1, ref + stride);
        movdqu(r2, ref + stride2);
        movdqu(r3, ref + stride3);
        movdqu(r4, ref + stride4);

        /* Vertical interpolation; each row is consumed as second operand
           before being overwritten as first operand. */
        pavgb(top, r1);
        pavgb(r1, r2);
        pavgb(r2, r3);
        pavgb(r3, r4);

        /* Blend with what is already in the destination. */
        pavgb(top, dest);
        pavgb(r1, dest + stride);
        pavgb(r2, dest + stride2);
        pavgb(r3, dest + stride3);

        movdqa(dest, top);
        movdqa(dest + stride, r1);
        movdqa(dest + stride2, r2);
        movdqa(dest + stride3, r3);

        /* r4 is still the raw source row; it seeds the next pass. */
        movdqa(top, r4);

        ref += stride4;
        dest += stride4;
        height -= 4;
    }
}

/*
 * Vertical half-sample "avg" for an 8-byte-wide block: pavgb(source row i,
 * source row i+1) is averaged with the current destination row i.  Two rows
 * share a register (high half = earlier row, low half = later row), four rows
 * are handled per pass.
 *
 *   dest    destination (read, then overwritten)
 *   ref     source (rows 0 .. height are touched)
 *   stride  byte distance between consecutive rows
 *   height  number of rows; must be a multiple of 4 for the loop to terminate
 *
 * The {row0,row1} pair is loaded at the top of every pass.  Carrying the last
 * register of the previous pass over instead would hand the next pass
 * {row3,row4} of the old position, i.e. rows -1 and 0 relative to the advanced
 * pointer, and rows 4 and 5 of an 8-row block would be interpolated from the
 * wrong source rows.
 *
 * The helpers come from DSUtil/simd.h and are assumed to match the SSE2
 * instructions of the same name (movlhps: low half of source -> high half of
 * destination).
 */
static void MC_avg_y_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
    const int stride2 = stride + stride;
    const int stride3 = stride2 + stride;
    const int stride4 = stride2 * 2;
    __m128i rows01, rows12, rows23, rows34, dst01, dst23;

    for (; height; ref += stride4, dest += stride4, height -= 4) {
        /* {hi: row0, lo: row1} */
        movhpd(rows01, ref);
        movlpd(rows01, ref + stride);

        /* {hi: row1, lo: row2} */
        movlhps(rows12, rows01);
        movlpd(rows12, ref + stride2);

        /* {hi: row2, lo: row3} */
        movlhps(rows23, rows12);
        movlpd(rows23, ref + stride3);

        /* {hi: row3, lo: row4} */
        movlhps(rows34, rows23);
        movlpd(rows34, ref + stride4);

        /* Current destination rows, packed the same way. */
        movhpd(dst01, dest);
        movlpd(dst01, dest + stride);
        movhpd(dst23, dest + stride2);
        movlpd(dst23, dest + stride3);

        pavgb(rows01, rows12);   /* {avg(r0,r1), avg(r1,r2)} */
        pavgb(rows23, rows34);   /* {avg(r2,r3), avg(r3,r4)} */
        pavgb(rows01, dst01);
        pavgb(rows23, dst23);

        movhpd(dest, rows01);
        movlpd(dest + stride, rows01);
        movhpd(dest + stride2, rows23);
        movlpd(dest + stride3, rows23);
    }
}

/*
 * Diagonal (horizontal + vertical) half-sample "avg" for a 16-byte-wide
 * block, two rows per pass.  Every source row is averaged with its
 * one-byte-shifted copy; vertically adjacent results are averaged (after
 * const_1_16_bytes has been subtracted with psubusb from the middle row's
 * horizontal average); the outcome is finally averaged with the current
 * destination row.  NOTE(review): the subtraction looks like the rounding
 * compensation of the AP-942 scheme mentioned in the file header - confirm.
 *
 *   dest    destination; read as a pavgb memory operand and written with movdqa
 *   ref     source, read with movdqu (rows 0 .. height, 17 bytes each)
 *   stride  byte distance between consecutive rows
 *   height  number of rows; must be a multiple of 2 for the loop to terminate
 *
 * The helpers come from DSUtil/simd.h and are assumed to match the SSE2
 * instructions of the same name.
 */
static void MC_avg_xy_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
    const int stride2 = stride + stride;
    __m128i ones, top, top_next, mid, mid_next, bot, bot_next;

    movdqa(ones, const_1_16_bytes);

    /* Row 0 and its shifted copy are loaded once; later passes inherit them. */
    movdqu(top, ref);
    movdqu(top_next, ref + 1);

    while (height) {
        movdqu(mid, ref + stride);
        movdqu(mid_next, ref + stride + 1);
        movdqu(bot, ref + stride2);
        movdqu(bot_next, ref + stride2 + 1);

        pavgb(top, top_next);          /* horizontal average, row 0 */
        pavgb(mid, mid_next);          /* horizontal average, row 1 */
        movdqa(top_next, bot_next);    /* keep shifted row 2 for the next pass */
        pavgb(bot_next, bot);          /* horizontal average, row 2 */
        psubusb(mid, ones);
        pavgb(top, mid);               /* interpolated row 0 */
        pavgb(mid, bot_next);          /* interpolated row 1 */

        /* Blend with what is already in the destination. */
        pavgb(top, dest);
        pavgb(mid, dest + stride);

        movdqa(dest, top);
        movdqa(dest + stride, mid);

        /* Unshifted row 2 becomes row 0 of the next pass. */
        movdqa(top, bot);

        ref += stride2;
        dest += stride2;
        height -= 2;
    }
}

/*
 * Diagonal (horizontal + vertical) half-sample "avg" for an 8-byte-wide
 * block, two rows per pass.  Two rows share a register (high half = earlier
 * row, low half = later row).  Vertically adjacent rows are averaged first,
 * for both the unshifted and the one-byte-shifted columns; const_1_16_bytes
 * is subtracted (psubusb) from the unshifted result, the two results are
 * averaged, and the outcome is averaged with the current destination rows.
 * NOTE(review): the subtraction is presumably the AP-942 rounding
 * compensation mentioned in the file header - confirm.
 *
 *   dest    destination (read, then overwritten)
 *   ref     source (rows 0 .. height, 9 bytes each)
 *   stride  byte distance between consecutive rows
 *   height  number of rows; must be a multiple of 2 for the loop to terminate
 *
 * The {row0,row1} pairs are loaded at the top of every pass.  Carrying the
 * {row1,row2} registers of the previous pass over instead would, after the
 * pointers advance by two rows, pair rows -1/0 with rows 1/2 and interpolate
 * every pass after the first from the wrong source rows.
 *
 * The helpers come from DSUtil/simd.h and are assumed to match the SSE2
 * instructions of the same name.
 */
static void MC_avg_xy_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
{
    const int stride2 = stride + stride;
    __m128i ones, upper, upper_next, lower, lower_next, dst;

    movdqa(ones, const_1_16_bytes);

    for (; height; ref += stride2, dest += stride2, height -= 2) {
        /* Upper row of each vertical pair: {hi: row0, lo: row1}. */
        movhpd(upper, ref);
        movlpd(upper, ref + stride);
        movhpd(upper_next, ref + 1);
        movlpd(upper_next, ref + stride + 1);

        /* Lower row of each vertical pair: {hi: row1, lo: row2}. */
        movhpd(lower, ref + stride);
        movlpd(lower, ref + stride2);
        movhpd(lower_next, ref + stride + 1);
        movlpd(lower_next, ref + stride2 + 1);

        pavgb(upper, lower);             /* vertical average, unshifted */
        pavgb(upper_next, lower_next);   /* vertical average, shifted */
        psubusb(upper, ones);
        pavgb(upper, upper_next);        /* interpolated rows 0 and 1 */

        /* Blend with what is already in the destination. */
        movhpd(dst, dest);
        movlpd(dst, dest + stride);
        pavgb(upper, dst);

        movhpd(dest, upper);
        movlpd(dest + stride, upper);
    }
}

/*
 * Table of the SSE2 motion-compensation routines defined in this file.
 * First group: the "put" routines; second group: the "avg" routines.
 * Inside each group the order is 16-wide o, x, y, xy followed by
 * 8-wide o, x, y, xy.
 */
mpeg2_mc_t mpeg2_mc_sse2 =
{
    {
        MC_put_o_16_sse2,
        MC_put_x_16_sse2,
        MC_put_y_16_sse2,
        MC_put_xy_16_sse2,
        MC_put_o_8_sse2,
        MC_put_x_8_sse2,
        MC_put_y_8_sse2,
        MC_put_xy_8_sse2
    },
    {
        MC_avg_o_16_sse2,
        MC_avg_x_16_sse2,
        MC_avg_y_16_sse2,
        MC_avg_xy_16_sse2,
        MC_avg_o_8_sse2,
        MC_avg_x_8_sse2,
        MC_avg_y_8_sse2,
        MC_avg_xy_8_sse2
    }
};
